In [1]:
%pylab inline
In [2]:
from classy import text
In [3]:
# Tally letter occurrences in one English sample file.
# count appears to be the per-letter tally and feature_names the matching
# letters -- confirm against the classy.text documentation.
count,feature_names=text.count_letters('data/languages/E3.txt')
print((count,feature_names))
In [4]:
# NOTE(review): this cell previously re-ran text.count_letters on the same
# file, duplicating the previous cell verbatim; the redundant call was
# removed and `count`/`feature_names` are reused from that cell.
# p holds the reference letter-frequency distribution for English.
p=text.letter_freq('English',feature_names)
print(p)
In [5]:
# Naive-Bayes-style score: log10-likelihood of the observed letter counts
# under the English letter-frequency model.
loglike = sum(count * log10(p))
print(loglike)
In [6]:
# Build the language classifier over the bundled language files
# (presumably trains per-language letter-frequency models -- confirm).
C=text.LanguageFileClassifier()
In [7]:
# Per-language log-likelihoods for every file matching the glob.
result=C.loglikelihood('data/languages/E*.txt',verbose=True)
In [8]:
# Predicted class index for each matching file (verbose prints details).
C.predict('data/languages/E*',verbose=True)
Out[8]:
In [9]:
# Translate each predicted class index into its human-readable language name.
[C.target_names[idx] for idx in C.predict('data/languages/E*')]
Out[9]:
In [10]:
from classy import text
In [11]:
# Load the raw movie-review text files; NOTE(review): these results are
# immediately overwritten by the text_to_vectors call in the next cell.
train=text.load_files('data/films/train',verbose=True)
test=text.load_files('data/films/test',verbose=True)
In [12]:
# Convert the train/test documents straight from their directories into
# bag-of-words vector datasets (this rebinds train and test, discarding
# the load_files results from the previous cell).
train,test=text.text_to_vectors('data/films/train','data/films/test',verbose=True)
In [13]:
# Inspect the (sparse) document-term matrix.
train.vectors
Out[13]:
In [14]:
# Densify the first document's sparse row into a flat 1-D numpy array.
v=array(train.vectors[0,:].todense()).ravel()
In [8]:
# Largest single feature count in this document.
v.max()
Out[8]:
In [9]:
# Length of the feature vector (vocabulary size).
v.shape
Out[9]:
In [13]:
v=array(train.vectors[0,:].todense()).ravel()
plot(v,'.')
v=array(train.vectors[10,:].todense()).ravel()
plot(v,'.')
xlabel('feature number')
ylabel('frequency of feature')
Out[13]:
In [14]:
# (n_documents, n_features) of the training matrix.
train.vectors.shape
Out[14]:
In [15]:
# Classifier for count vectors (presumably multinomial naive Bayes --
# confirm against the classy docs).
C=text.Multinomial()
In [16]:
# Fit on the training count vectors and their class labels.
C.fit(train.vectors,train.targets)
Out[16]:
In [17]:
# Predicted class labels for the held-out test documents.
C.predict(test.vectors)
Out[17]:
In [18]:
# Test-set accuracy (percent correct).
C.percent_correct(test.vectors,test.targets)
Out[18]:
In [19]:
# Wildcard imports hide where names come from; import explicitly the two
# classy names the rest of this notebook uses.
from classy import text, vectors_to_image
In [20]:
# Reload the raw film reviews, keeping the loaded collections this time
# so they can be passed to text_to_vectors directly.
train_files=text.load_files('data/films/train',verbose=True)
test_files=text.load_files('data/films/test',verbose=True)
In [21]:
# text_to_vectors also accepts already-loaded file collections
# (the earlier call passed directory paths instead).
train_data,test_data=text.text_to_vectors(train_files,test_files,verbose=True)
In [22]:
# Sparse document-term matrix for the training set.
train_data.vectors
Out[22]:
In [23]:
# Visualize the matrix as an image; binary=True presumably shows mere
# presence/absence of each feature -- confirm against the classy docs.
vectors_to_image(train_data.vectors,binary=True)
In [24]:
# Same visualization with binary=False (actual values, not presence/absence).
vectors_to_image(train_data.vectors,binary=False)
In [25]:
from classy import text
In [26]:
# NOTE(review): identical to the earlier load_files cell -- redundant re-load.
train_files=text.load_files('data/films/train',verbose=True)
test_files=text.load_files('data/films/test',verbose=True)
In [27]:
# NOTE(review): duplicates the earlier text_to_vectors call verbatim.
train_data,test_data=text.text_to_vectors(train_files,test_files,verbose=True)
In [28]:
# Unigram-only matrix; compare its feature count with the n-gram run below.
train_data.vectors
Out[28]:
Specify the `ngram_range` — the smallest n-gram to use and the largest. The default is (1, 1), so only 1-grams are used. This example calculates the 1-grams and the 2-grams (bigrams).
In [29]:
# Re-vectorize with unigrams and bigrams; the feature count grows accordingly.
train_data,test_data=text.text_to_vectors(train_files,test_files,ngram_range=(1,2),verbose=True)
train_data.vectors
Out[29]:
In [30]:
# Peek at the first 100 feature names (now including bigrams).
print(train_data.feature_names[:100])
In [ ]:
In [ ]: